{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# 04.01 - DATA EXPLORATION"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/ai4eng.v1/main/content/init.py\n", "import init; init.init(force_download=False); init.get_weblink()"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\n", "## Based on [Kaggle House Pricing Prediction Competition](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/)\n", "\n", "- Inspect and learn from the competition [Notebooks](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/notebooks)\n", "- You must make the `train.csv` file from the competition [data](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data) section available to this notebook. If you are running this notebook in Google Colab, upload the file through the Files section of the Colab notebook."]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"data": {"text/html": ["
\n", " | Id | \n", "MSSubClass | \n", "MSZoning | \n", "LotFrontage | \n", "LotArea | \n", "Street | \n", "Alley | \n", "LotShape | \n", "LandContour | \n", "Utilities | \n", "... | \n", "PoolArea | \n", "PoolQC | \n", "Fence | \n", "MiscFeature | \n", "MiscVal | \n", "MoSold | \n", "YrSold | \n", "SaleType | \n", "SaleCondition | \n", "SalePrice | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "60 | \n", "RL | \n", "65.0 | \n", "8450 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2008 | \n", "WD | \n", "Normal | \n", "208500 | \n", "
1 | \n", "2 | \n", "20 | \n", "RL | \n", "80.0 | \n", "9600 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "5 | \n", "2007 | \n", "WD | \n", "Normal | \n", "181500 | \n", "
2 | \n", "3 | \n", "60 | \n", "RL | \n", "68.0 | \n", "11250 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "9 | \n", "2008 | \n", "WD | \n", "Normal | \n", "223500 | \n", "
3 | \n", "4 | \n", "70 | \n", "RL | \n", "60.0 | \n", "9550 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2006 | \n", "WD | \n", "Abnorml | \n", "140000 | \n", "
4 | \n", "5 | \n", "60 | \n", "RL | \n", "84.0 | \n", "14260 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "12 | \n", "2008 | \n", "WD | \n", "Normal | \n", "250000 | \n", "
5 rows \u00d7 81 columns
\n", "\n", " | count | \n", "mean | \n", "std | \n", "min | \n", "25% | \n", "50% | \n", "75% | \n", "max | \n", "
---|---|---|---|---|---|---|---|---|
Id | \n", "1460.0 | \n", "730.500000 | \n", "421.610009 | \n", "1.0 | \n", "365.75 | \n", "730.5 | \n", "1095.25 | \n", "1460.0 | \n", "
MSSubClass | \n", "1460.0 | \n", "56.897260 | \n", "42.300571 | \n", "20.0 | \n", "20.00 | \n", "50.0 | \n", "70.00 | \n", "190.0 | \n", "
LotFrontage | \n", "1201.0 | \n", "70.049958 | \n", "24.284752 | \n", "21.0 | \n", "59.00 | \n", "69.0 | \n", "80.00 | \n", "313.0 | \n", "
LotArea | \n", "1460.0 | \n", "10516.828082 | \n", "9981.264932 | \n", "1300.0 | \n", "7553.50 | \n", "9478.5 | \n", "11601.50 | \n", "215245.0 | \n", "
OverallQual | \n", "1460.0 | \n", "6.099315 | \n", "1.382997 | \n", "1.0 | \n", "5.00 | \n", "6.0 | \n", "7.00 | \n", "10.0 | \n", "
OverallCond | \n", "1460.0 | \n", "5.575342 | \n", "1.112799 | \n", "1.0 | \n", "5.00 | \n", "5.0 | \n", "6.00 | \n", "9.0 | \n", "
YearBuilt | \n", "1460.0 | \n", "1971.267808 | \n", "30.202904 | \n", "1872.0 | \n", "1954.00 | \n", "1973.0 | \n", "2000.00 | \n", "2010.0 | \n", "
YearRemodAdd | \n", "1460.0 | \n", "1984.865753 | \n", "20.645407 | \n", "1950.0 | \n", "1967.00 | \n", "1994.0 | \n", "2004.00 | \n", "2010.0 | \n", "
MasVnrArea | \n", "1452.0 | \n", "103.685262 | \n", "181.066207 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "166.00 | \n", "1600.0 | \n", "
BsmtFinSF1 | \n", "1460.0 | \n", "443.639726 | \n", "456.098091 | \n", "0.0 | \n", "0.00 | \n", "383.5 | \n", "712.25 | \n", "5644.0 | \n", "
BsmtFinSF2 | \n", "1460.0 | \n", "46.549315 | \n", "161.319273 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.00 | \n", "1474.0 | \n", "
BsmtUnfSF | \n", "1460.0 | \n", "567.240411 | \n", "441.866955 | \n", "0.0 | \n", "223.00 | \n", "477.5 | \n", "808.00 | \n", "2336.0 | \n", "
TotalBsmtSF | \n", "1460.0 | \n", "1057.429452 | \n", "438.705324 | \n", "0.0 | \n", "795.75 | \n", "991.5 | \n", "1298.25 | \n", "6110.0 | \n", "
1stFlrSF | \n", "1460.0 | \n", "1162.626712 | \n", "386.587738 | \n", "334.0 | \n", "882.00 | \n", "1087.0 | \n", "1391.25 | \n", "4692.0 | \n", "
2ndFlrSF | \n", "1460.0 | \n", "346.992466 | \n", "436.528436 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "728.00 | \n", "2065.0 | \n", "
LowQualFinSF | \n", "1460.0 | \n", "5.844521 | \n", "48.623081 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.00 | \n", "572.0 | \n", "
GrLivArea | \n", "1460.0 | \n", "1515.463699 | \n", "525.480383 | \n", "334.0 | \n", "1129.50 | \n", "1464.0 | \n", "1776.75 | \n", "5642.0 | \n", "
BsmtFullBath | \n", "1460.0 | \n", "0.425342 | \n", "0.518911 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "1.00 | \n", "3.0 | \n", "
BsmtHalfBath | \n", "1460.0 | \n", "0.057534 | \n", "0.238753 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.00 | \n", "2.0 | \n", "
FullBath | \n", "1460.0 | \n", "1.565068 | \n", "0.550916 | \n", "0.0 | \n", "1.00 | \n", "2.0 | \n", "2.00 | \n", "3.0 | \n", "
HalfBath | \n", "1460.0 | \n", "0.382877 | \n", "0.502885 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "1.00 | \n", "2.0 | \n", "
BedroomAbvGr | \n", "1460.0 | \n", "2.866438 | \n", "0.815778 | \n", "0.0 | \n", "2.00 | \n", "3.0 | \n", "3.00 | \n", "8.0 | \n", "
KitchenAbvGr | \n", "1460.0 | \n", "1.046575 | \n", "0.220338 | \n", "0.0 | \n", "1.00 | \n", "1.0 | \n", "1.00 | \n", "3.0 | \n", "
TotRmsAbvGrd | \n", "1460.0 | \n", "6.517808 | \n", "1.625393 | \n", "2.0 | \n", "5.00 | \n", "6.0 | \n", "7.00 | \n", "14.0 | \n", "
Fireplaces | \n", "1460.0 | \n", "0.613014 | \n", "0.644666 | \n", "0.0 | \n", "0.00 | \n", "1.0 | \n", "1.00 | \n", "3.0 | \n", "
GarageYrBlt | \n", "1379.0 | \n", "1978.506164 | \n", "24.689725 | \n", "1900.0 | \n", "1961.00 | \n", "1980.0 | \n", "2002.00 | \n", "2010.0 | \n", "
GarageCars | \n", "1460.0 | \n", "1.767123 | \n", "0.747315 | \n", "0.0 | \n", "1.00 | \n", "2.0 | \n", "2.00 | \n", "4.0 | \n", "
GarageArea | \n", "1460.0 | \n", "472.980137 | \n", "213.804841 | \n", "0.0 | \n", "334.50 | \n", "480.0 | \n", "576.00 | \n", "1418.0 | \n", "
WoodDeckSF | \n", "1460.0 | \n", "94.244521 | \n", "125.338794 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "168.00 | \n", "857.0 | \n", "
OpenPorchSF | \n", "1460.0 | \n", "46.660274 | \n", "66.256028 | \n", "0.0 | \n", "0.00 | \n", "25.0 | \n", "68.00 | \n", "547.0 | \n", "
EnclosedPorch | \n", "1460.0 | \n", "21.954110 | \n", "61.119149 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.00 | \n", "552.0 | \n", "
3SsnPorch | \n", "1460.0 | \n", "3.409589 | \n", "29.317331 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.00 | \n", "508.0 | \n", "
ScreenPorch | \n", "1460.0 | \n", "15.060959 | \n", "55.757415 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.00 | \n", "480.0 | \n", "
PoolArea | \n", "1460.0 | \n", "2.758904 | \n", "40.177307 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.00 | \n", "738.0 | \n", "
MiscVal | \n", "1460.0 | \n", "43.489041 | \n", "496.123024 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.00 | \n", "15500.0 | \n", "
MoSold | \n", "1460.0 | \n", "6.321918 | \n", "2.703626 | \n", "1.0 | \n", "5.00 | \n", "6.0 | \n", "8.00 | \n", "12.0 | \n", "
YrSold | \n", "1460.0 | \n", "2007.815753 | \n", "1.328095 | \n", "2006.0 | \n", "2007.00 | \n", "2008.0 | \n", "2009.00 | \n", "2010.0 | \n", "
SalePrice | \n", "1460.0 | \n", "180921.195890 | \n", "79442.502883 | \n", "34900.0 | \n", "129975.00 | \n", "163000.0 | \n", "214000.00 | \n", "755000.0 | \n", "